(Created in June 2019)
This notebook depends on the following libraries:
import dimcli
from dimcli.shortcuts import dslquery, dslqueryall
import pandas as pd
from pandas.io.json import json_normalize
import plotly_express as px
from plotly.offline import init_notebook_mode # needed for exports
init_notebook_mode(connected=True)
import time
GRIDID = "grid.258518.3"  # GRID ID of the organisation analysed ("KSU" per the output filenames — confirm)
YEAR_START = "2009"  # earliest publication year included in the detailed extraction

# Yearly publication counts for the org (facet-style query returning `year`).
# NOTE(review): there is no space before `return` — the DSL appears to tolerate it; confirm.
df_all_pubs_per_year = dslquery(f"""search publications where research_orgs.id="{GRIDID}"return year limit 1000""").as_dataframe()
# Full publication records (basics + FOR codes) since YEAR_START;
# dslqueryall pages through the complete result set.
pubs_details = dslqueryall(f"""search publications where research_orgs.id="{GRIDID}" and year>="{YEAR_START}" return publications[basics+FOR]""")
df_pubs_details = pubs_details.as_dataframe()
Create a new list of publications with simplified FOR data
# Ensure that every publication has a FOR field (empty list when absent, so
# the schema stays regular for json_normalize) and strip the numeric
# "NNNN " code prefix from each FOR name to improve legibility.
# The original comprehension reused `x` as its loop variable, shadowing the
# outer `x` — renamed for clarity.
for pub in pubs_details.publications:
    if 'FOR' not in pub:
        pub['FOR'] = []
    else:
        pub['FOR'] = [{'name': cat['name'][5:]} for cat in pub['FOR']]
df_pubs_for = json_normalize(pubs_details.publications, record_path=['FOR'], meta=["id", "type", ["journal", "title"], "year"], errors='ignore', record_prefix='for_')
Get the patent details using the publication IDs
# query structure is:
# d=dslquery(f"""search patents where publication_ids in ["pub.1111511314","pub.1113174788","pub.1111902055"] return patents limit 1000""")
from itertools import islice

def chunks_of(data, size):
    """Yield consecutive lists of at most *size* items taken from *data*.

    Works on any iterable; the final chunk may be shorter than *size*.
    """
    iterator = iter(data)
    while True:
        batch = list(islice(iterator, size))
        if not batch:
            return
        yield batch
SIZE = 400  # number of publication IDs passed per patents query

def run(ids_list):
    """Fetch all patents that cite any of the given publication IDs.

    Queries the API in batches of SIZE IDs (the original comment said
    "chunks of 200", which contradicted SIZE = 400 — fixed) and returns
    the concatenated list of patent records.
    NOTE(review): assumes each batch yields fewer than the 1000-record
    query limit — confirm for larger orgs.
    """
    patents_out, n = [], 0
    for chunk in chunks_of(ids_list, SIZE):  # batches of SIZE publication IDs
        n += 1
        temp = ','.join(['"{}"'.format(i) for i in chunk])
        data = dslquery(f"""search patents where publication_ids in [{temp}] return patents[basics+publication_ids+FOR] limit 1000""")
        patents_out += data.patents
        print("[log] ", n*SIZE, " pubs > patents: ", len(data.patents))
        time.sleep(1)  # throttle between batches to be polite to the API
    return patents_out

patents_list = run(list(df_pubs_details['id']))
After going through all publications and extracting the related patents, let's save the patent data so that we can reuse it later:
# Build a DataFrame of the patent records. pd.DataFrame(patents_list) is the
# direct constructor; the original pd.DataFrame().from_dict(...) invoked the
# classmethod through a throwaway empty instance, which is misleading.
df_patent_details = pd.DataFrame(patents_list)
# save to CSV
df_patent_details.to_csv("data/KSU_patents_by_id.csv")
# display top 3 rows
df_patent_details.head(3)
We also want to normalise the assignees data in order to analyse it further later on (NB: first we need to make the assignees data structure more regular)
# Make the assignees structure regular: guarantee every patent record has
# an 'assignees' key (empty list when missing) so json_normalize can
# flatten all of them uniformly.
for patent in patents_list:
    patent.setdefault('assignees', [])
df_patents_assignees = json_normalize(patents_list, record_path=['assignees'], meta=['id', 'year', 'title'], meta_prefix="grant_")
# save to CSV
df_patents_assignees.to_csv("data/KSU_patents_by_assignees.csv")
df_patents_assignees.head()
Let's do the same type of normalization for FOR codes
# Same FOR clean-up as for publications: guarantee the key exists (empty
# list, not "", so the schema stays regular) and strip the numeric "NNNN "
# prefix from each FOR name. The original comprehension reused `x` as its
# loop variable, shadowing the outer `x` — renamed for clarity.
for patent in patents_list:
    if 'FOR' not in patent:
        patent['FOR'] = []
    else:
        patent['FOR'] = [{'name': cat['name'][5:]} for cat in patent['FOR']]
df_patents_for = json_normalize(patents_list, record_path=['FOR'], meta=["id", "year", "title"], errors='ignore', record_prefix='for_')
# save to CSV
df_patents_for.to_csv("data/KSU_patents_by_FOR.csv")
df_patents_for.head()
Finally, let's create another publications index, including only the KSU publications cited by the patents extracted above.
# Extract the publication IDs cited by the patents, de-duplicated.
# Use .get() so patents lacking the 'publication_ids' key don't raise a
# KeyError (other keys such as 'assignees'/'FOR' can be absent, so this
# one may be too), and accumulate into a set directly instead of
# building a list and de-duplicating afterwards.
pubs_referenced_from_patents = set()
for patent in patents_list:
    pubs_referenced_from_patents.update(patent.get('publication_ids') or [])
pubs_referenced_from_patents = list(pubs_referenced_from_patents)
len(pubs_referenced_from_patents)
Intersect list of publications from KSU with list of publications mentioned in patents
# Keep only the KSU publications whose IDs appear in at least one patent.
df_linked_pubs = df_pubs_details[df_pubs_details['id'].isin(pubs_referenced_from_patents)]
# reset_index returns a NEW DataFrame — the original call discarded the
# result, leaving the filtered (non-contiguous) index in place.
df_linked_pubs = df_linked_pubs.reset_index(drop=True)
# save to CSV
df_linked_pubs.to_csv("data/KSU_pubs_linked_to_patents.csv")
df_linked_pubs.head()
Also, create a version of df_linked_pubs with simplified FOR codes so that it's easier to visualise.
# FOR-level view restricted to the publications cited by patents.
df_linked_pubs_for = df_pubs_for[df_pubs_for['id'].isin(pubs_referenced_from_patents)]
# reset_index does not work in place — assign the returned DataFrame
# (the original call discarded it).
df_linked_pubs_for = df_linked_pubs_for.reset_index(drop=True)
df_linked_pubs_for.head()
# Chart 1: overall publication output per year for the org.
px.bar(df_all_pubs_per_year, x="id", y="count", title="Total Publications per year from KSU")
# Chart 2: FOR research areas of all recent publications, with marginal totals.
px.scatter(df_pubs_for, x="year", y="for_name", color="type", hover_name="for_name", marginal_x="histogram", marginal_y="histogram", height=800, title="Research areas of pubs in last 10 years (marginal subplots = X/Y totals)")
# Chart 3: how many publications cited by patents were published each year.
px.bar(df_linked_pubs.groupby('year', as_index=False).count(), x="year", y="id", title="Publications mentioned in patents, by year of publication")
# Chart 4: FOR research areas of the patent-cited publications only.
px.scatter(df_linked_pubs_for, x="year", y="for_name", color="type", hover_name="for_name", marginal_x="histogram", marginal_y="histogram", height=800, title="Research areas of pubs mentioned in patents (marginal subplots = X/Y totals)")
# Chart 5: citing patents per year.
px.bar(df_patent_details.groupby('year', as_index=False).count(), x="year", y="id", title="Patents per Year citing publications from KSU")
# Chart 6: FOR research areas of the citing patents.
px.scatter(df_patents_for, x="year", y="for_name", hover_name="for_name", marginal_x="histogram", marginal_y="histogram", height=800, title="Research areas of patents (marginal subplots = X/Y totals)")
# Chart 7: patent citation counts by year, faceted by filing status.
px.scatter(df_patent_details, x="year", y="times_cited", hover_name="title", hover_data=['id'], facet_col="filing_status", title="Patents per Year VS Timed Cited VS Filing Status")
# Chart 8: most frequent assignees, ranked by number of patents.
px.bar(df_patents_assignees.groupby('name', as_index=False).count().sort_values(by="grant_id", ascending=False), x="name", y="grant_id", hover_name="name", height=400, title="Assignees by No of Patents")
# Chart 9: assignees plotted by grant year, coloured by country.
px.scatter(df_patents_assignees, x="grant_year", y="name", color="country_name", hover_name="name", hover_data=["id"], height=800, title="Assignees By Country and Year")